{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 导入第三方模块\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn import model_selection\n",
    "from sklearn.linear_model import Ridge,RidgeCV\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# 读取糖尿病数据集\n",
    "diabetes = pd.read_excel(r'C:\\Users\\Administrator\\Desktop\\diabetes.xlsx', sep = '')\n",
    "# 构造自变量（剔除患者性别、年龄和因变量）\n",
    "predictors = diabetes.columns[2:-1]\n",
    "# 将数据集拆分为训练集和测试集\n",
    "X_train, X_test, y_train, y_test = model_selection.train_test_split(diabetes[predictors], diabetes['Y'], \n",
    "                                                                    test_size = 0.2, random_state = 1234 )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 构造不同的Lambda值\n",
    "Lambdas = np.logspace(-5, 2, 200)\n",
    "# 构造空列表，用于存储模型的偏回归系数\n",
    "ridge_cofficients = []\n",
    "# 循环迭代不同的Lambda值\n",
    "for Lambda in Lambdas:\n",
    "    ridge = Ridge(alpha = Lambda, normalize=True)\n",
    "    ridge.fit(X_train, y_train)\n",
    "    ridge_cofficients.append(ridge.coef_)\n",
    "\n",
    "# 绘制Lambda与回归系数的关系\n",
    "# 中文乱码和坐标轴负号的处理\n",
    "plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']\n",
    "plt.rcParams['axes.unicode_minus'] = False\n",
    "# 设置绘图风格\n",
    "plt.style.use('ggplot')\n",
    "plt.plot(Lambdas, ridge_cofficients)\n",
    "# 对x轴作对数变换\n",
    "plt.xscale('log')\n",
    "# 设置折线图x轴和y轴标签\n",
    "plt.xlabel('Lambda')\n",
    "plt.ylabel('Cofficients')\n",
    "# 图形显示\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 岭回归模型的交叉验证\n",
    "# 设置交叉验证的参数，对于每一个Lambda值，都执行10重交叉验证\n",
    "ridge_cv = RidgeCV(alphas = Lambdas, normalize=True, scoring='neg_mean_squared_error', cv = 10)\n",
    "# 模型拟合\n",
    "ridge_cv.fit(X_train, y_train)\n",
    "# 返回最佳的lambda值\n",
    "ridge_best_Lambda = ridge_cv.alpha_\n",
    "ridge_best_Lambda"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 导入第三方包中的函数\n",
    "from sklearn.metrics import mean_squared_error\n",
    "# 基于最佳的Lambda值建模\n",
    "ridge = Ridge(alpha = ridge_best_alpha, normalize=True)\n",
    "ridge.fit(X_train, y_train)\n",
    "# 返回岭回归系数\n",
    "pd.Series(index = ['Intercept'] + X_train.columns.tolist(),data = [ridge.intercept_] + ridge.coef_.tolist())\n",
    "# 预测\n",
    "ridge_predict = ridge.predict(X_test)\n",
    "# 预测效果验证\n",
    "RMSE = np.sqrt(mean_squared_error(y_test,ridge_predict))\n",
    "RMSE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 导入第三方模块中的函数\n",
    "from sklearn.linear_model import Lasso,LassoCV\n",
    "# 构造空列表，用于存储模型的偏回归系数\n",
    "lasso_cofficients = []\n",
    "for Lambda in Lambdas:\n",
    "    lasso = Lasso(alpha = Lambda, normalize=True, max_iter=10000)\n",
    "    lasso.fit(X_train, y_train)\n",
    "    lasso_cofficients.append(lasso.coef_)\n",
    "\n",
    "# 绘制Lambda与回归系数的关系\n",
    "plt.plot(Lambdas, lasso_cofficients)\n",
    "# 对x轴作对数变换\n",
    "plt.xscale('log')\n",
    "# 设置折线图x轴和y轴标签\n",
    "plt.xlabel('Lambda')\n",
    "plt.ylabel('Cofficients')\n",
    "# 显示图形\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# LASSO回归模型的交叉验证\n",
    "lasso_cv = LassoCV(alphas = Lambdas, normalize=True, cv = 10, max_iter=10000)\n",
    "lasso_cv.fit(X_train, y_train)\n",
    "# 输出最佳的lambda值\n",
    "lasso_best_alpha = lasso_cv.alpha_\n",
    "lasso_best_alpha"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 基于最佳的lambda值建模\n",
    "lasso = Lasso(alpha = lasso_best_alpha, normalize=True, max_iter=10000)\n",
    "lasso.fit(X_train, y_train)\n",
    "# 返回LASSO回归的系数\n",
    "pd.Series(index = ['Intercept'] + X_train.columns.tolist(),data = [lasso.intercept_] + lasso.coef_.tolist())\n",
    "\n",
    "# 预测\n",
    "lasso_predict = lasso.predict(X_test)\n",
    "# 预测效果验证\n",
    "RMSE = np.sqrt(mean_squared_error(y_test,lasso_predict))\n",
    "RMSE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 导入第三方模块\n",
    "from statsmodels import api as sms\n",
    "# 为自变量X添加常数列1，用于拟合截距项\n",
    "X_train2 = sms.add_constant(X_train)\n",
    "X_test2 = sms.add_constant(X_test)\n",
    "\n",
    "# 构建多元线性回归模型\n",
    "linear = sms.formula.OLS(y_train, X_train2).fit()\n",
    "# 返回线性回归模型的系数\n",
    "linear.params\n",
    "\n",
    "# 模型的预测\n",
    "linear_predict = linear.predict(X_test2)\n",
    "# 预测效果验证\n",
    "RMSE = np.sqrt(mean_squared_error(y_test,linear_predict))\n",
    "RMSE"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [Root]",
   "language": "python",
   "name": "Python [Root]"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
